This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
spotify <- read.csv("Spotify_top50.csv")
str(spotify)
## 'data.frame': 50 obs. of 14 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Track.Name : chr "Se\xf1orita" "China" "boyfriend (with Social House)" "Beautiful People (feat. Khalid)" ...
## $ Artist.Name : chr "Shawn Mendes" "Anuel AA" "Ariana Grande" "Ed Sheeran" ...
## $ Genre : chr "canadian pop" "reggaeton flow" "dance pop" "pop" ...
## $ Beats.Per.Minute: int 117 105 190 93 150 102 180 111 136 135 ...
## $ Energy : int 55 81 80 65 65 68 64 68 62 43 ...
## $ Danceability : int 76 79 40 64 58 80 75 48 88 70 ...
## $ Loudness..dB.. : int -6 -4 -4 -8 -4 -5 -6 -5 -6 -11 ...
## $ Liveness : int 8 8 16 8 11 9 7 8 11 10 ...
## $ Valence. : int 75 61 70 55 18 84 23 35 64 56 ...
## $ Length. : int 191 302 186 198 175 220 131 202 157 194 ...
## $ Acousticness.. : int 4 8 12 12 45 9 2 15 5 33 ...
## $ Speechiness. : int 3 9 46 19 7 4 29 9 10 38 ...
## $ Popularity : int 79 92 85 86 94 84 92 90 87 95 ...
library(tidyverse) # for data wrangling
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
spotify <- spotify %>%
select(-X)
colSums(is.na(spotify))
## Track.Name Artist.Name Genre Beats.Per.Minute
## 0 0 0 0
## Energy Danceability Loudness..dB.. Liveness
## 0 0 0 0
## Valence. Length. Acousticness.. Speechiness.
## 0 0 0 0
## Popularity
## 0
library(plotly) # for interactive plot
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(glue) # for glue text
top10_song <- spotify %>%
arrange(desc(Popularity)) %>%
head(10) %>%
select(c(Track.Name, Artist.Name, Genre, Popularity, Length.)) %>%
mutate(mean_length = mean(Length.),
text = glue(
"Artist = {Artist.Name}
Genre = {Genre}"
))
plot_top10_song <- ggplot(data = top10_song, aes(x = reorder(Track.Name, Popularity),
y = Popularity,
text = text,
label = Popularity))+
geom_col(aes(fill = Popularity), show.legend = F)+
theme_bw()+
coord_flip()+
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 12, colour = "black"),
title = element_text(size = 12, colour = "black"))+
geom_text(aes(label = Popularity), color = "white", size = 6, fontface = "bold", position = position_stack(0.8))+
labs(title = "Top 10 Song on Spotify in 2022",
x = "Song Title",
y = "Popularity Rate",
caption = "Source : Kaggle Dataset")
ggplotly(plot_top10_song, tooltip = "text")
top3_genre <- spotify %>%
group_by(Genre) %>%
summarise(song = n()) %>%
ungroup() %>%
mutate(song = song/50) %>%
arrange(desc(song)) %>%
head(3)
library(ggplot2) #to make plot
plot_top3_genre <- ggplot(data = top3_genre, aes(x = reorder(Genre, song),
y = song,
label = song))+
geom_col(aes(fill = song), show.legend = FALSE)+
theme_bw()+
coord_flip()+
theme(axis.text = element_text(size = 12),
axis.title = element_text(size = 14, colour = "black"),
title = element_text(size = 14, colour = "black"))+
geom_text(aes(label = scales::percent(song)), color = "white", size = 12, fontface = "bold", position = position_stack(0.7))+
labs(title = "Top 3 Genre of Spotify Most Popular Song 2022",
x = "Genre of Music",
y = "Rate of Genre",
caption = "Source : Kaggle Dataset")
plot_top3_genre
spotify_ppt <- spotify %>%
select_if(is.numeric) %>%
select(-Popularity) # this variable would not be used even integer since it does not carelated to thid clasification.
glimpse(spotify_ppt)
## Rows: 50
## Columns: 9
## $ Beats.Per.Minute <int> 117, 105, 190, 93, 150, 102, 180, 111, 136, 135, 176,…
## $ Energy <int> 55, 81, 80, 65, 65, 68, 64, 68, 62, 43, 62, 71, 41, 7…
## $ Danceability <int> 76, 79, 40, 64, 58, 80, 75, 48, 88, 70, 61, 82, 50, 7…
## $ Loudness..dB.. <int> -6, -4, -4, -8, -4, -5, -6, -5, -6, -11, -5, -4, -6, …
## $ Liveness <int> 8, 8, 16, 8, 11, 9, 7, 8, 11, 10, 24, 15, 11, 6, 12, …
## $ Valence. <int> 75, 61, 70, 55, 18, 84, 23, 35, 64, 56, 24, 38, 45, 7…
## $ Length. <int> 191, 302, 186, 198, 175, 220, 131, 202, 157, 194, 251…
## $ Acousticness.. <int> 4, 8, 12, 12, 45, 9, 2, 15, 5, 33, 60, 28, 75, 7, 10,…
## $ Speechiness. <int> 3, 9, 46, 19, 7, 4, 29, 9, 10, 38, 31, 7, 3, 20, 5, 1…
spotify_scale <- scale(spotify_ppt, center = T, scale = T)
RNGkind(sample.kind = "Rounding")
## Warning in RNGkind(sample.kind = "Rounding"): non-uniform 'Rounding' sampler
## used
kmeansTunning <- function(data, maxK){
withinall <- NULL
total_k <- NULL
for (i in 2: maxK){
set.seed(101)
temp <- kmeans(data,i)$tot.withinss
withinall <- append(withinall, temp)
total_k <- append(total_k,i)
}
plot(x = total_k, y = withinall, type = "o", xlab = "Number of Cluster", ylab = "Total Within")
}
kmeansTunning(spotify_scale, maxK = 7)
set.seed(101)
spotify_cluster <- kmeans(spotify_ppt, 6)
spotify_ppt$cluster <- spotify_cluster$cluster
spotify_ppt$cluster <- as.factor(spotify_ppt$cluster)
library(FactoMineR) # for PCA
pca_spotify <- PCA(spotify_ppt, quali.sup =10, graph = F, scale.unit = T)
# plot
plot.PCA(pca_spotify, choix = "ind", label = "none", habillage = 10)
summary(pca_spotify)
##
## Call:
## PCA(X = spotify_ppt, scale.unit = T, quali.sup = 10, graph = F)
##
##
## Eigenvalues
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 Dim.7
## Variance 2.252 1.578 1.273 1.015 0.898 0.732 0.692
## % of var. 25.020 17.532 14.144 11.282 9.982 8.139 7.691
## Cumulative % of var. 25.020 42.553 56.697 67.979 77.961 86.100 93.791
## Dim.8 Dim.9
## Variance 0.335 0.224
## % of var. 3.723 2.486
## Cumulative % of var. 97.514 100.000
##
## Individuals (the 10 first)
## Dist Dim.1 ctr cos2 Dim.2 ctr cos2
## 1 | 1.886 | 0.154 0.021 0.007 | -0.310 0.122 0.027 |
## 2 | 3.269 | 2.085 3.860 0.407 | 0.189 0.045 0.003 |
## 3 | 4.937 | -0.002 0.000 0.000 | 4.103 21.336 0.691 |
## 4 | 1.874 | -0.689 0.422 0.135 | -0.050 0.003 0.001 |
## 5 | 2.816 | -0.725 0.467 0.066 | 0.039 0.002 0.000 |
## 6 | 2.102 | 1.293 1.485 0.378 | -0.314 0.125 0.022 |
## 7 | 3.624 | -1.589 2.242 0.192 | 2.276 6.565 0.394 |
## 8 | 2.364 | -0.052 0.002 0.000 | -0.098 0.012 0.002 |
## 9 | 2.182 | -0.066 0.004 0.001 | 0.394 0.196 0.033 |
## 10 | 3.905 | -3.226 9.242 0.682 | 1.025 1.333 0.069 |
## Dim.3 ctr cos2
## 1 -1.322 2.745 0.491 |
## 2 -0.135 0.028 0.002 |
## 3 2.071 6.738 0.176 |
## 4 -0.159 0.040 0.007 |
## 5 1.284 2.589 0.208 |
## 6 -1.315 2.718 0.392 |
## 7 -0.457 0.329 0.016 |
## 8 1.211 2.303 0.262 |
## 9 -1.789 5.031 0.672 |
## 10 -0.063 0.006 0.000 |
##
## Variables
## Dim.1 ctr cos2 Dim.2 ctr cos2 Dim.3 ctr
## Beats.Per.Minute | -0.231 2.375 0.053 | 0.840 44.735 0.706 | 0.082 0.522
## Energy | 0.845 31.691 0.714 | 0.339 7.303 0.115 | -0.002 0.000
## Danceability | 0.126 0.710 0.016 | -0.084 0.451 0.007 | -0.737 42.615
## Loudness..dB.. | 0.813 29.325 0.660 | 0.115 0.842 0.013 | 0.151 1.792
## Liveness | 0.371 6.127 0.138 | -0.238 3.597 0.057 | 0.578 26.252
## Valence. | 0.502 11.180 0.252 | 0.212 2.860 0.045 | -0.406 12.962
## Length. | 0.359 5.718 0.129 | 0.009 0.006 0.000 | 0.355 9.896
## Acousticness.. | -0.362 5.832 0.131 | -0.277 4.868 0.077 | 0.211 3.505
## Speechiness. | -0.398 7.042 0.159 | 0.747 35.339 0.558 | 0.177 2.456
## cos2
## Beats.Per.Minute 0.007 |
## Energy 0.000 |
## Danceability 0.542 |
## Loudness..dB.. 0.023 |
## Liveness 0.334 |
## Valence. 0.165 |
## Length. 0.126 |
## Acousticness.. 0.045 |
## Speechiness. 0.031 |
##
## Supplementary categories
## Dist Dim.1 cos2 v.test Dim.2 cos2 v.test
## cluster_1 | 1.458 | -1.042 0.511 -3.032 | 0.434 0.089 1.510 |
## cluster_2 | 2.697 | -0.191 0.005 -0.297 | 2.412 0.800 4.480 |
## cluster_3 | 2.162 | 1.820 0.708 3.135 | -0.521 0.058 -1.072 |
## cluster_4 | 1.835 | -0.370 0.041 -0.808 | -1.136 0.383 -2.966 |
## cluster_5 | 1.285 | 0.140 0.012 0.387 | -0.553 0.186 -1.828 |
## cluster_6 | 3.224 | 2.047 0.403 2.412 | 0.802 0.062 1.129 |
## Dim.3 cos2 v.test
## cluster_1 -0.532 0.133 -2.059 |
## cluster_2 1.016 0.142 2.100 |
## cluster_3 0.597 0.076 1.368 |
## cluster_4 0.886 0.233 2.575 |
## cluster_5 -0.704 0.300 -2.587 |
## cluster_6 -0.012 0.000 -0.018 |
plot.PCA(pca_spotify)
pca_dimdesc <- dimdesc(pca_spotify)
pca_dimdesc$Dim.1
## $quanti
## correlation p.value
## Energy 0.8447698 1.244339e-14
## Loudness..dB.. 0.8126227 7.751612e-13
## Valence. 0.5017403 2.055450e-04
## Liveness 0.3714510 7.910044e-03
## Length. 0.3588254 1.049896e-02
## Acousticness.. -0.3623999 9.700876e-03
## Speechiness. -0.3982108 4.182600e-03
##
## $quali
## R2 p.value
## cluster 0.4379973 8.282538e-05
##
## $category
## Estimate p.value
## cluster=cluster_3 1.419174 0.001110847
## cluster=cluster_6 1.646350 0.014255483
## cluster=cluster_1 -1.442859 0.001677372
##
## attr(,"class")
## [1] "condes" "list"
plot.PCA(pca_spotify, choix = "var", col.ind = spotify_ppt$cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(spotify_cluster,
data = spotify_ppt[,-10])+
theme_minimal()
spotify_cluster$withinss
## [1] 28670.214 8828.800 6850.333 12348.444 18546.462 4674.667
spotify_cluster$totss
## [1] 193255
spotify_cluster$betweenss
## [1] 113336.1
spotify_cluster$betweenss/spotify_cluster$totss
## [1] 0.5864587
spotify_ppt %>%
group_by(cluster) %>%
summarise_all("mean")
## # A tibble: 6 × 10
## cluster Beats…¹ Energy Dance…² Loudn…³ Liven…⁴ Valen…⁵ Length. Acous…⁶ Speec…⁷
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 142. 58 74.4 -7.07 12.3 51.2 167 19.7 14.8
## 2 2 179. 70.4 63.6 -4.8 14 55.8 222. 25 29.8
## 3 3 98.5 77 71.5 -4.5 29.8 69 227. 18.2 7.83
## 4 4 96.9 54.2 63.9 -5.89 18.8 29.9 213 14.9 6.89
## 5 5 99.3 65.4 74.8 -5.38 9.23 65.8 186. 33.5 9.46
## 6 6 124. 79.7 77.7 -3.33 7.67 65 300. 9.67 12
## # … with abbreviated variable names ¹Beats.Per.Minute, ²Danceability,
## # ³Loudness..dB.., ⁴Liveness, ⁵Valence., ⁶Acousticness.., ⁷Speechiness.